# Set the knitr chunk option `echo` to TRUE for the chunks in this document.
knitr::opts_chunk$set(echo = TRUE)
library(readxl)
## Warning: package 'readxl' was built under R version 3.4.4
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.5
## ✔ tidyr   0.8.1     ✔ stringr 1.2.0
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## Warning: package 'tidyr' was built under R version 3.4.4
## Warning: package 'purrr' was built under R version 3.4.4
## Warning: package 'dplyr' was built under R version 3.4.4
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

Bring in the data!

# Location of the case-study workbook. Kept in one variable so the path only
# has to be edited in a single place when the file moves.
data_path <- path.expand(
  "~/Desktop/SMU Classes/MSDS 6306/Case Study II/CaseStudy2_2/CaseStudy2-data.xlsx"
)
if (!file.exists(data_path)) {
  stop("Data file not found: ", data_path, call. = FALSE)
}

# Raw attrition data, one row per employee.
attraw <- read_excel(data_path)

# View() opens an interactive data viewer, so only call it in a live session;
# it serves no purpose when the document is knitted non-interactively.
if (interactive()) {
  View(attraw)
}

# Notes on the summary printed below:
# - 1470 rows of data.
# - No NA counts are reported for the numeric columns (summary() does not
#   report NAs for the character columns, so those still need checking).
# - No one is under age 18.
# - Employee numbers go up to 2068 although there are only 1470 rows.
# - Job satisfaction: mean 2.7, median 3.0.
summary(attraw)
##       Age         Attrition         BusinessTravel       DailyRate     
##  Min.   :18.00   Length:1470        Length:1470        Min.   : 102.0  
##  1st Qu.:30.00   Class :character   Class :character   1st Qu.: 465.0  
##  Median :36.00   Mode  :character   Mode  :character   Median : 802.0  
##  Mean   :36.92                                         Mean   : 802.5  
##  3rd Qu.:43.00                                         3rd Qu.:1157.0  
##  Max.   :60.00                                         Max.   :1499.0  
##   Department        DistanceFromHome   Education     EducationField    
##  Length:1470        Min.   : 1.000   Min.   :1.000   Length:1470       
##  Class :character   1st Qu.: 2.000   1st Qu.:2.000   Class :character  
##  Mode  :character   Median : 7.000   Median :3.000   Mode  :character  
##                     Mean   : 9.193   Mean   :2.913                     
##                     3rd Qu.:14.000   3rd Qu.:4.000                     
##                     Max.   :29.000   Max.   :5.000                     
##  EmployeeCount EmployeeNumber   EnvironmentSatisfaction    Gender         
##  Min.   :1     Min.   :   1.0   Min.   :1.000           Length:1470       
##  1st Qu.:1     1st Qu.: 491.2   1st Qu.:2.000           Class :character  
##  Median :1     Median :1020.5   Median :3.000           Mode  :character  
##  Mean   :1     Mean   :1024.9   Mean   :2.722                             
##  3rd Qu.:1     3rd Qu.:1555.8   3rd Qu.:4.000                             
##  Max.   :1     Max.   :2068.0   Max.   :4.000                             
##    HourlyRate     JobInvolvement    JobLevel       JobRole         
##  Min.   : 30.00   Min.   :1.00   Min.   :1.000   Length:1470       
##  1st Qu.: 48.00   1st Qu.:2.00   1st Qu.:1.000   Class :character  
##  Median : 66.00   Median :3.00   Median :2.000   Mode  :character  
##  Mean   : 65.89   Mean   :2.73   Mean   :2.064                     
##  3rd Qu.: 83.75   3rd Qu.:3.00   3rd Qu.:3.000                     
##  Max.   :100.00   Max.   :4.00   Max.   :5.000                     
##  JobSatisfaction MaritalStatus      MonthlyIncome    MonthlyRate   
##  Min.   :1.000   Length:1470        Min.   : 1009   Min.   : 2094  
##  1st Qu.:2.000   Class :character   1st Qu.: 2911   1st Qu.: 8047  
##  Median :3.000   Mode  :character   Median : 4919   Median :14236  
##  Mean   :2.729                      Mean   : 6503   Mean   :14313  
##  3rd Qu.:4.000                      3rd Qu.: 8379   3rd Qu.:20462  
##  Max.   :4.000                      Max.   :19999   Max.   :26999  
##  NumCompaniesWorked    Over18            OverTime        
##  Min.   :0.000      Length:1470        Length:1470       
##  1st Qu.:1.000      Class :character   Class :character  
##  Median :2.000      Mode  :character   Mode  :character  
##  Mean   :2.693                                           
##  3rd Qu.:4.000                                           
##  Max.   :9.000                                           
##  PercentSalaryHike PerformanceRating RelationshipSatisfaction
##  Min.   :11.00     Min.   :3.000     Min.   :1.000           
##  1st Qu.:12.00     1st Qu.:3.000     1st Qu.:2.000           
##  Median :14.00     Median :3.000     Median :3.000           
##  Mean   :15.21     Mean   :3.154     Mean   :2.712           
##  3rd Qu.:18.00     3rd Qu.:3.000     3rd Qu.:4.000           
##  Max.   :25.00     Max.   :4.000     Max.   :4.000           
##  StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear
##  Min.   :80    Min.   :0.0000   Min.   : 0.00     Min.   :0.000        
##  1st Qu.:80    1st Qu.:0.0000   1st Qu.: 6.00     1st Qu.:2.000        
##  Median :80    Median :1.0000   Median :10.00     Median :3.000        
##  Mean   :80    Mean   :0.7939   Mean   :11.28     Mean   :2.799        
##  3rd Qu.:80    3rd Qu.:1.0000   3rd Qu.:15.00     3rd Qu.:3.000        
##  Max.   :80    Max.   :3.0000   Max.   :40.00     Max.   :6.000        
##  WorkLifeBalance YearsAtCompany   YearsInCurrentRole
##  Min.   :1.000   Min.   : 0.000   Min.   : 0.000    
##  1st Qu.:2.000   1st Qu.: 3.000   1st Qu.: 2.000    
##  Median :3.000   Median : 5.000   Median : 3.000    
##  Mean   :2.761   Mean   : 7.008   Mean   : 4.229    
##  3rd Qu.:3.000   3rd Qu.: 9.000   3rd Qu.: 7.000    
##  Max.   :4.000   Max.   :40.000   Max.   :18.000    
##  YearsSinceLastPromotion YearsWithCurrManager
##  Min.   : 0.000          Min.   : 0.000      
##  1st Qu.: 0.000          1st Qu.: 2.000      
##  Median : 1.000          Median : 3.000      
##  Mean   : 2.188          Mean   : 4.123      
##  3rd Qu.: 3.000          3rd Qu.: 7.000      
##  Max.   :15.000          Max.   :17.000

Start the EDA

Graphically look at the data

# List every column available in the raw data before plotting them one by one.
names(attraw)
##  [1] "Age"                      "Attrition"               
##  [3] "BusinessTravel"           "DailyRate"               
##  [5] "Department"               "DistanceFromHome"        
##  [7] "Education"                "EducationField"          
##  [9] "EmployeeCount"            "EmployeeNumber"          
## [11] "EnvironmentSatisfaction"  "Gender"                  
## [13] "HourlyRate"               "JobInvolvement"          
## [15] "JobLevel"                 "JobRole"                 
## [17] "JobSatisfaction"          "MaritalStatus"           
## [19] "MonthlyIncome"            "MonthlyRate"             
## [21] "NumCompaniesWorked"       "Over18"                  
## [23] "OverTime"                 "PercentSalaryHike"       
## [25] "PerformanceRating"        "RelationshipSatisfaction"
## [27] "StandardHours"            "StockOptionLevel"        
## [29] "TotalWorkingYears"        "TrainingTimesLastYear"   
## [31] "WorkLifeBalance"          "YearsAtCompany"          
## [33] "YearsInCurrentRole"       "YearsSinceLastPromotion" 
## [35] "YearsWithCurrManager"
# Univariate distribution of every column for ALL employees (attraw).
# - Categorical / ordinal columns: bar chart of proportions (..prop.. with
#   group = 1 so proportions are computed over the whole data set).
# - Discrete numeric columns: bar chart of counts.
# - Continuous columns (DailyRate, MonthlyIncome, MonthlyRate): histogram.
#   geom_bar() ignores `binwidth` (it only warns), so geom_histogram() is used
#   to actually get the requested bins.
ggplot(data = attraw) + geom_bar(mapping = aes(x = Age))

ggplot(data = attraw) + geom_bar(mapping = aes(x = Attrition, y = ..prop.., group = 1), stat = "count")

ggplot(data = attraw) + geom_bar(mapping = aes(x = BusinessTravel, y = ..prop.., group = 1), stat = "count")

ggplot(data = attraw) + geom_histogram(mapping = aes(x = DailyRate), binwidth = 100)

ggplot(data = attraw) + geom_bar(mapping = aes(x = Department, y = ..prop.., group = 1), stat = "count")

ggplot(data = attraw) + geom_bar(mapping = aes(x = DistanceFromHome))

ggplot(data = attraw) + geom_bar(mapping = aes(x = Education))

ggplot(data = attraw) + geom_bar(mapping = aes(x = EducationField, y = ..prop.., group = 1), stat = "count")

ggplot(data = attraw) + geom_bar(mapping = aes(x = EmployeeCount))

ggplot(data = attraw) + geom_bar(mapping = aes(x = EmployeeNumber))

ggplot(data = attraw) + geom_bar(mapping = aes(x = EnvironmentSatisfaction, y = ..prop.., group = 1), stat = "count")

ggplot(data = attraw) + geom_bar(mapping = aes(x = Gender, y = ..prop.., group = 1), stat = "count")

ggplot(data = attraw) + geom_bar(mapping = aes(x = HourlyRate))

ggplot(data = attraw) + geom_bar(mapping = aes(x = JobInvolvement, y = ..prop.., group = 1), stat = "count")

ggplot(data = attraw) + geom_bar(mapping = aes(x = JobLevel, y = ..prop.., group = 1), stat = "count")

ggplot(data = attraw) + geom_bar(mapping = aes(x = JobRole, y = ..prop.., group = 1), stat = "count")

ggplot(data = attraw) + geom_bar(mapping = aes(x = JobSatisfaction, y = ..prop.., group = 1), stat = "count")

ggplot(data = attraw) + geom_bar(mapping = aes(x = MaritalStatus, y = ..prop.., group = 1), stat = "count")

ggplot(data = attraw) + geom_histogram(mapping = aes(x = MonthlyIncome), binwidth = 5000)

ggplot(data = attraw) + geom_histogram(mapping = aes(x = MonthlyRate), binwidth = 10000)

ggplot(data = attraw) + geom_bar(mapping = aes(x = NumCompaniesWorked))

ggplot(data = attraw) + geom_bar(mapping = aes(x = Over18))

ggplot(data = attraw) + geom_bar(mapping = aes(x = OverTime, y = ..prop.., group = 1), stat = "count")

ggplot(data = attraw) + geom_bar(mapping = aes(x = PercentSalaryHike))

# PerformanceRating is a discrete rating, so a plain bar chart is the right
# geom; the `binwidth` argument was ignored by geom_bar() and is dropped.
ggplot(data = attraw) + geom_bar(mapping = aes(x = PerformanceRating))

ggplot(data = attraw) + geom_bar(mapping = aes(x = RelationshipSatisfaction, y = ..prop.., group = 1), stat = "count")

ggplot(data = attraw) + geom_bar(mapping = aes(x = StandardHours))

ggplot(data = attraw) + geom_bar(mapping = aes(x = StockOptionLevel))

ggplot(data = attraw) + geom_bar(mapping = aes(x = TotalWorkingYears))

ggplot(data = attraw) + geom_bar(mapping = aes(x = TrainingTimesLastYear))

ggplot(data = attraw) + geom_bar(mapping = aes(x = WorkLifeBalance, y = ..prop.., group = 1), stat = "count")

ggplot(data = attraw) + geom_bar(mapping = aes(x = YearsAtCompany))

ggplot(data = attraw) + geom_bar(mapping = aes(x = YearsInCurrentRole))

ggplot(data = attraw) + geom_bar(mapping = aes(x = YearsSinceLastPromotion))

ggplot(data = attraw) + geom_bar(mapping = aes(x = YearsWithCurrManager))

Graphically look at the data more

# Scatter of employee number against tenure.
ggplot(attraw, aes(x = EmployeeNumber, y = YearsAtCompany)) +
  geom_point()

# Split the raw data by attrition outcome, keeping every column.
attryes <- dplyr::filter(attraw, Attrition == "Yes")
attrno <- dplyr::filter(attraw, Attrition == "No")

# Column summaries for the employees who left.
summary(attryes)
##       Age         Attrition         BusinessTravel       DailyRate     
##  Min.   :18.00   Length:237         Length:237         Min.   : 103.0  
##  1st Qu.:28.00   Class :character   Class :character   1st Qu.: 408.0  
##  Median :32.00   Mode  :character   Mode  :character   Median : 699.0  
##  Mean   :33.61                                         Mean   : 750.4  
##  3rd Qu.:39.00                                         3rd Qu.:1092.0  
##  Max.   :58.00                                         Max.   :1496.0  
##   Department        DistanceFromHome   Education    EducationField    
##  Length:237         Min.   : 1.00    Min.   :1.00   Length:237        
##  Class :character   1st Qu.: 3.00    1st Qu.:2.00   Class :character  
##  Mode  :character   Median : 9.00    Median :3.00   Mode  :character  
##                     Mean   :10.63    Mean   :2.84                     
##                     3rd Qu.:17.00    3rd Qu.:4.00                     
##                     Max.   :29.00    Max.   :5.00                     
##  EmployeeCount EmployeeNumber EnvironmentSatisfaction    Gender         
##  Min.   :1     Min.   :   1   Min.   :1.000           Length:237        
##  1st Qu.:1     1st Qu.: 514   1st Qu.:1.000           Class :character  
##  Median :1     Median :1017   Median :3.000           Mode  :character  
##  Mean   :1     Mean   :1010   Mean   :2.464                             
##  3rd Qu.:1     3rd Qu.:1486   3rd Qu.:4.000                             
##  Max.   :1     Max.   :2055   Max.   :4.000                             
##    HourlyRate     JobInvolvement     JobLevel       JobRole         
##  Min.   : 31.00   Min.   :1.000   Min.   :1.000   Length:237        
##  1st Qu.: 50.00   1st Qu.:2.000   1st Qu.:1.000   Class :character  
##  Median : 66.00   Median :3.000   Median :1.000   Mode  :character  
##  Mean   : 65.57   Mean   :2.519   Mean   :1.637                     
##  3rd Qu.: 84.00   3rd Qu.:3.000   3rd Qu.:2.000                     
##  Max.   :100.00   Max.   :4.000   Max.   :5.000                     
##  JobSatisfaction MaritalStatus      MonthlyIncome    MonthlyRate   
##  Min.   :1.000   Length:237         Min.   : 1009   Min.   : 2326  
##  1st Qu.:1.000   Class :character   1st Qu.: 2373   1st Qu.: 8870  
##  Median :3.000   Mode  :character   Median : 3202   Median :14618  
##  Mean   :2.468                      Mean   : 4787   Mean   :14559  
##  3rd Qu.:3.000                      3rd Qu.: 5916   3rd Qu.:21081  
##  Max.   :4.000                      Max.   :19859   Max.   :26999  
##  NumCompaniesWorked    Over18            OverTime        
##  Min.   :0.000      Length:237         Length:237        
##  1st Qu.:1.000      Class :character   Class :character  
##  Median :1.000      Mode  :character   Mode  :character  
##  Mean   :2.941                                           
##  3rd Qu.:5.000                                           
##  Max.   :9.000                                           
##  PercentSalaryHike PerformanceRating RelationshipSatisfaction
##  Min.   :11.0      Min.   :3.000     Min.   :1.000           
##  1st Qu.:12.0      1st Qu.:3.000     1st Qu.:2.000           
##  Median :14.0      Median :3.000     Median :3.000           
##  Mean   :15.1      Mean   :3.156     Mean   :2.599           
##  3rd Qu.:17.0      3rd Qu.:3.000     3rd Qu.:4.000           
##  Max.   :25.0      Max.   :4.000     Max.   :4.000           
##  StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear
##  Min.   :80    Min.   :0.0000   Min.   : 0.000    Min.   :0.000        
##  1st Qu.:80    1st Qu.:0.0000   1st Qu.: 3.000    1st Qu.:2.000        
##  Median :80    Median :0.0000   Median : 7.000    Median :2.000        
##  Mean   :80    Mean   :0.5274   Mean   : 8.245    Mean   :2.624        
##  3rd Qu.:80    3rd Qu.:1.0000   3rd Qu.:10.000    3rd Qu.:3.000        
##  Max.   :80    Max.   :3.0000   Max.   :40.000    Max.   :6.000        
##  WorkLifeBalance YearsAtCompany   YearsInCurrentRole
##  Min.   :1.000   Min.   : 0.000   Min.   : 0.000    
##  1st Qu.:2.000   1st Qu.: 1.000   1st Qu.: 0.000    
##  Median :3.000   Median : 3.000   Median : 2.000    
##  Mean   :2.658   Mean   : 5.131   Mean   : 2.903    
##  3rd Qu.:3.000   3rd Qu.: 7.000   3rd Qu.: 4.000    
##  Max.   :4.000   Max.   :40.000   Max.   :15.000    
##  YearsSinceLastPromotion YearsWithCurrManager
##  Min.   : 0.000          Min.   : 0.000      
##  1st Qu.: 0.000          1st Qu.: 0.000      
##  Median : 1.000          Median : 2.000      
##  Mean   : 1.945          Mean   : 2.852      
##  3rd Qu.: 2.000          3rd Qu.: 5.000      
##  Max.   :15.000          Max.   :14.000
# Column summaries for the rows of attrno (the Attrition == "No" subset).
summary(attrno)
##       Age         Attrition         BusinessTravel       DailyRate     
##  Min.   :18.00   Length:1233        Length:1233        Min.   : 102.0  
##  1st Qu.:31.00   Class :character   Class :character   1st Qu.: 477.0  
##  Median :36.00   Mode  :character   Mode  :character   Median : 817.0  
##  Mean   :37.56                                         Mean   : 812.5  
##  3rd Qu.:43.00                                         3rd Qu.:1176.0  
##  Max.   :60.00                                         Max.   :1499.0  
##   Department        DistanceFromHome   Education     EducationField    
##  Length:1233        Min.   : 1.000   Min.   :1.000   Length:1233       
##  Class :character   1st Qu.: 2.000   1st Qu.:2.000   Class :character  
##  Mode  :character   Median : 7.000   Median :3.000   Mode  :character  
##                     Mean   : 8.916   Mean   :2.927                     
##                     3rd Qu.:13.000   3rd Qu.:4.000                     
##                     Max.   :29.000   Max.   :5.000                     
##  EmployeeCount EmployeeNumber EnvironmentSatisfaction    Gender         
##  Min.   :1     Min.   :   2   Min.   :1.000           Length:1233       
##  1st Qu.:1     1st Qu.: 483   1st Qu.:2.000           Class :character  
##  Median :1     Median :1022   Median :3.000           Mode  :character  
##  Mean   :1     Mean   :1028   Mean   :2.771                             
##  3rd Qu.:1     3rd Qu.:1574   3rd Qu.:4.000                             
##  Max.   :1     Max.   :2068   Max.   :4.000                             
##    HourlyRate     JobInvolvement    JobLevel       JobRole         
##  Min.   : 30.00   Min.   :1.00   Min.   :1.000   Length:1233       
##  1st Qu.: 48.00   1st Qu.:2.00   1st Qu.:1.000   Class :character  
##  Median : 66.00   Median :3.00   Median :2.000   Mode  :character  
##  Mean   : 65.95   Mean   :2.77   Mean   :2.146                     
##  3rd Qu.: 83.00   3rd Qu.:3.00   3rd Qu.:3.000                     
##  Max.   :100.00   Max.   :4.00   Max.   :5.000                     
##  JobSatisfaction MaritalStatus      MonthlyIncome    MonthlyRate   
##  Min.   :1.000   Length:1233        Min.   : 1051   Min.   : 2094  
##  1st Qu.:2.000   Class :character   1st Qu.: 3211   1st Qu.: 7973  
##  Median :3.000   Mode  :character   Median : 5204   Median :14120  
##  Mean   :2.779                      Mean   : 6833   Mean   :14266  
##  3rd Qu.:4.000                      3rd Qu.: 8834   3rd Qu.:20364  
##  Max.   :4.000                      Max.   :19999   Max.   :26997  
##  NumCompaniesWorked    Over18            OverTime        
##  Min.   :0.000      Length:1233        Length:1233       
##  1st Qu.:1.000      Class :character   Class :character  
##  Median :2.000      Mode  :character   Mode  :character  
##  Mean   :2.646                                           
##  3rd Qu.:4.000                                           
##  Max.   :9.000                                           
##  PercentSalaryHike PerformanceRating RelationshipSatisfaction
##  Min.   :11.00     Min.   :3.000     Min.   :1.000           
##  1st Qu.:12.00     1st Qu.:3.000     1st Qu.:2.000           
##  Median :14.00     Median :3.000     Median :3.000           
##  Mean   :15.23     Mean   :3.153     Mean   :2.734           
##  3rd Qu.:18.00     3rd Qu.:3.000     3rd Qu.:4.000           
##  Max.   :25.00     Max.   :4.000     Max.   :4.000           
##  StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear
##  Min.   :80    Min.   :0.0000   Min.   : 0.00     Min.   :0.000        
##  1st Qu.:80    1st Qu.:0.0000   1st Qu.: 6.00     1st Qu.:2.000        
##  Median :80    Median :1.0000   Median :10.00     Median :3.000        
##  Mean   :80    Mean   :0.8451   Mean   :11.86     Mean   :2.833        
##  3rd Qu.:80    3rd Qu.:1.0000   3rd Qu.:16.00     3rd Qu.:3.000        
##  Max.   :80    Max.   :3.0000   Max.   :38.00     Max.   :6.000        
##  WorkLifeBalance YearsAtCompany   YearsInCurrentRole
##  Min.   :1.000   Min.   : 0.000   Min.   : 0.000    
##  1st Qu.:2.000   1st Qu.: 3.000   1st Qu.: 2.000    
##  Median :3.000   Median : 6.000   Median : 3.000    
##  Mean   :2.781   Mean   : 7.369   Mean   : 4.484    
##  3rd Qu.:3.000   3rd Qu.:10.000   3rd Qu.: 7.000    
##  Max.   :4.000   Max.   :37.000   Max.   :18.000    
##  YearsSinceLastPromotion YearsWithCurrManager
##  Min.   : 0.000          Min.   : 0.000      
##  1st Qu.: 0.000          1st Qu.: 2.000      
##  Median : 1.000          Median : 3.000      
##  Mean   : 2.234          Mean   : 4.367      
##  3rd Qu.: 3.000          3rd Qu.: 7.000      
##  Max.   :15.000          Max.   :17.000
library(miscTools)

# Numeric columns compared across the three data sets. Defined once so the
# mean table, the median table and the Name labels can never drift apart.
numeric_cols <- c(
  "Age", "DailyRate", "DistanceFromHome", "Education",
  "EnvironmentSatisfaction", "HourlyRate", "JobInvolvement", "JobLevel",
  "JobSatisfaction", "MonthlyIncome", "MonthlyRate", "NumCompaniesWorked",
  "PercentSalaryHike", "PerformanceRating", "RelationshipSatisfaction",
  "StockOptionLevel", "TotalWorkingYears", "TrainingTimesLastYear",
  "WorkLifeBalance", "YearsAtCompany", "YearsInCurrentRole",
  "YearsSinceLastPromotion", "YearsWithCurrManager"
)

# Column means: all employees, attrition = Yes, attrition = No.
Mndat <- colMeans(attraw[, numeric_cols])
Mnyes <- colMeans(attryes[, numeric_cols])
Mnno <- colMeans(attrno[, numeric_cols])

# tibble() replaces the deprecated data_frame() alias; the result is the same.
meandf <- tibble(Mndat, Mnyes, Mnno)
tmeandf <- t(meandf)

# Column medians for the same three data sets (colMedians() from miscTools).
Medat <- colMedians(attraw[, numeric_cols])
Meyes <- colMedians(attryes[, numeric_cols])
Meno <- colMedians(attrno[, numeric_cols])

meaddf <- tibble(Medat, Meyes, Meno)

# Label each row with the variable it describes so the tables can be joined.
meandf1 <- add_column(meandf, Name = numeric_cols, .before = 1)
meaddf1 <- add_column(meaddf, Name = numeric_cols, .before = 1)

# One row per variable with means and medians side by side. merge() sorts the
# rows by Name; the columns come out as Name, the three means, then the three
# medians, which is the order the new names below rely on.
totaldf <- merge(meandf1, meaddf1, by = "Name")
colnames(totaldf) <- c(
  "FactorName",
  "Mean_original", "Mean_Yes", "Mean_No",
  "Median_original", "Median_Yes", "Median_No"
)

# Compare means of columns of the raw data vs. no-attrition vs. yes-attrition.
#
# Variables that seem to have little value:
#   Employee number (doesn't seem to correlate with longer time at the company),
#   Employee count (all = 1), Standard hours (all = 80), Over 18 (all = Y).
#
# Significant variables needing more insight:
#   Age: mean for all = 36.9, yes = 33.6, no = 37.5. Median and mean are close for all sets.
#   DailyRate: mean for all = 802, yes = 750.36, no = 812. Median is lower by 50 in yes-attrition.
#   Job level: mean for all = 2.06, yes = 1.63, no = 2.14. Median for yes is 1, much lower than the others.
#   Monthly income: mean for all = 6502.93, yes = 4787.93, no = 6832.74. About 1500 difference between all medians and means.
#   Stock option level: mean for all = .793, yes = .527, no = .845. Median for all and no = 1; yes is much lower with a median of 0.
#   Total working years: mean for all = 11.27, yes = 8.24, no = 11.86. Median and mean are close for all sets.
#   Years at company: mean for all = 7.00, yes = 5.13, no = 7.36. Big jump of 40-60% from median to mean in all data sets.
#   Years in current role: mean for all = 4.22, yes = 2.90, no = 4.48. Jump of about 50% in all data sets.
#   Years since last promotion: mean for all = 2.18, yes = 1.94, no = 2.23. Big jumps of 94 to 118% in all data sets.
#   Years with current manager: mean for all = 4.12, yes = 2.85, no = 4.36. Jumps of about 40% from median to mean.
#
# Still need to look closer at the categorical data.

Attrition only

# Univariate distribution of every column for employees who LEFT (attryes).
# - Categorical / ordinal columns: bar chart of proportions (..prop.. with
#   group = 1 so proportions are computed over the whole subset).
# - Discrete numeric columns: bar chart of counts.
# - Continuous columns (DailyRate, MonthlyIncome, MonthlyRate): histogram.
#   geom_bar() ignores `binwidth` (it only warns), so geom_histogram() is used
#   to actually get the requested bins.
ggplot(data = attryes) + geom_bar(mapping = aes(x = Age))

ggplot(data = attryes) + geom_bar(mapping = aes(x = Attrition, y = ..prop.., group = 1), stat = "count")

ggplot(data = attryes) + geom_bar(mapping = aes(x = BusinessTravel, y = ..prop.., group = 1), stat = "count")

ggplot(data = attryes) + geom_histogram(mapping = aes(x = DailyRate), binwidth = 100)

ggplot(data = attryes) + geom_bar(mapping = aes(x = Department, y = ..prop.., group = 1), stat = "count")

ggplot(data = attryes) + geom_bar(mapping = aes(x = DistanceFromHome))

ggplot(data = attryes) + geom_bar(mapping = aes(x = Education))

ggplot(data = attryes) + geom_bar(mapping = aes(x = EducationField, y = ..prop.., group = 1), stat = "count")

ggplot(data = attryes) + geom_bar(mapping = aes(x = EmployeeCount))

ggplot(data = attryes) + geom_bar(mapping = aes(x = EmployeeNumber))

ggplot(data = attryes) + geom_bar(mapping = aes(x = EnvironmentSatisfaction, y = ..prop.., group = 1), stat = "count")

ggplot(data = attryes) + geom_bar(mapping = aes(x = Gender, y = ..prop.., group = 1), stat = "count")

ggplot(data = attryes) + geom_bar(mapping = aes(x = HourlyRate))

ggplot(data = attryes) + geom_bar(mapping = aes(x = JobInvolvement, y = ..prop.., group = 1), stat = "count")

ggplot(data = attryes) + geom_bar(mapping = aes(x = JobLevel, y = ..prop.., group = 1), stat = "count")

ggplot(data = attryes) + geom_bar(mapping = aes(x = JobRole, y = ..prop.., group = 1), stat = "count")

ggplot(data = attryes) + geom_bar(mapping = aes(x = JobSatisfaction, y = ..prop.., group = 1), stat = "count")

ggplot(data = attryes) + geom_bar(mapping = aes(x = MaritalStatus, y = ..prop.., group = 1), stat = "count")

ggplot(data = attryes) + geom_histogram(mapping = aes(x = MonthlyIncome), binwidth = 5000)

ggplot(data = attryes) + geom_histogram(mapping = aes(x = MonthlyRate), binwidth = 10000)

ggplot(data = attryes) + geom_bar(mapping = aes(x = NumCompaniesWorked))

ggplot(data = attryes) + geom_bar(mapping = aes(x = Over18))

ggplot(data = attryes) + geom_bar(mapping = aes(x = OverTime, y = ..prop.., group = 1), stat = "count")

ggplot(data = attryes) + geom_bar(mapping = aes(x = PercentSalaryHike))

# PerformanceRating is a discrete rating, so a plain bar chart is the right
# geom; the `binwidth` argument was ignored by geom_bar() and is dropped.
ggplot(data = attryes) + geom_bar(mapping = aes(x = PerformanceRating))

ggplot(data = attryes) + geom_bar(mapping = aes(x = RelationshipSatisfaction, y = ..prop.., group = 1), stat = "count")

# No `binwidth` here: geom_bar() ignored it, and the matching plots for the
# other two data sets do not pass one either.
ggplot(data = attryes) + geom_bar(mapping = aes(x = StandardHours))

ggplot(data = attryes) + geom_bar(mapping = aes(x = StockOptionLevel))

ggplot(data = attryes) + geom_bar(mapping = aes(x = TotalWorkingYears))

ggplot(data = attryes) + geom_bar(mapping = aes(x = TrainingTimesLastYear))

ggplot(data = attryes) + geom_bar(mapping = aes(x = WorkLifeBalance, y = ..prop.., group = 1), stat = "count")

ggplot(data = attryes) + geom_bar(mapping = aes(x = YearsAtCompany))

ggplot(data = attryes) + geom_bar(mapping = aes(x = YearsInCurrentRole))

ggplot(data = attryes) + geom_bar(mapping = aes(x = YearsSinceLastPromotion))

ggplot(data = attryes) + geom_bar(mapping = aes(x = YearsWithCurrManager))

No attrition

ggplot(data = attrno) + geom_bar(mapping = aes(x = Age))

ggplot(data = attrno) + geom_bar(mapping = aes(x = Attrition, y = ..prop.., group = 1), stat = "count")

ggplot(data = attrno) + geom_bar(mapping = aes(x = BusinessTravel, y = ..prop.., group = 1), stat = "count")

ggplot(data = attrno) + geom_bar(mapping = aes(x = DailyRate), binwidth = 100)
## Warning: `geom_bar()` no longer has a `binwidth` parameter. Please use
## `geom_histogram()` instead.

ggplot(data = attrno) + geom_bar(mapping = aes(x = Department, y = ..prop.., group = 1), stat = "count")

ggplot(data = attrno) + geom_bar(mapping = aes(x = DistanceFromHome))

ggplot(data = attrno) + geom_bar(mapping = aes(x = Education))

ggplot(data = attrno) + geom_bar(mapping = aes(x = EducationField, y = ..prop.., group = 1), stat = "count")

ggplot(data = attrno) + geom_bar(mapping = aes(x = EmployeeCount))

ggplot(data = attrno) + geom_bar(mapping = aes(x = EmployeeNumber))

ggplot(data = attrno) + geom_bar(mapping = aes(x = EnvironmentSatisfaction, y = ..prop.., group = 1), stat = "count")

ggplot(data = attrno) + geom_bar(mapping = aes(x = Gender, y = ..prop.., group = 1), stat = "count")

ggplot(data = attrno) + geom_bar(mapping = aes(x = HourlyRate))

ggplot(data = attrno) + geom_bar(mapping = aes(x = JobInvolvement, y = ..prop.., group = 1), stat = "count")

ggplot(data = attrno) + geom_bar(mapping = aes(x = JobLevel, y = ..prop.., group = 1), stat = "count")

ggplot(data = attrno) + geom_bar(mapping = aes(x = JobRole, y = ..prop.., group = 1), stat = "count")

ggplot(data = attrno) + geom_bar(mapping = aes(x = JobSatisfaction, y = ..prop.., group = 1), stat = "count")

ggplot(data = attrno) + geom_bar(mapping = aes(x = MaritalStatus, y = ..prop.., group = 1), stat = "count")

# Distributions of the monthly pay variables in attrno.
# geom_bar() has no `binwidth` parameter (it only raised a warning pointing to
# geom_histogram()), so the histogram geom is called directly.

# MonthlyIncome in bins of width 5000.
ggplot(data = attrno) +
  geom_histogram(mapping = aes(x = MonthlyIncome), binwidth = 5000)

# MonthlyRate in bins of width 10000.
ggplot(data = attrno) +
  geom_histogram(mapping = aes(x = MonthlyRate), binwidth = 10000)

# Row counts per NumCompaniesWorked value in attrno.
ggplot(attrno, aes(x = NumCompaniesWorked)) +
  geom_bar()

# Row counts per Over18 value.
ggplot(attrno, aes(x = Over18)) +
  geom_bar()

# Share of attrno rows per OverTime value (group = 1 makes ..prop.. a share
# of all rows).
ggplot(attrno, aes(x = OverTime, y = ..prop.., group = 1)) +
  geom_bar()

# Row counts per PercentSalaryHike value.
ggplot(attrno, aes(x = PercentSalaryHike)) +
  geom_bar()

# Distribution of PerformanceRating in attrno, in bins of width 0.5.
# geom_bar() has no `binwidth` parameter (it only raised a warning pointing to
# geom_histogram()), so the histogram geom is called directly.
ggplot(data = attrno) +
  geom_histogram(mapping = aes(x = PerformanceRating), binwidth = 0.5)

# Remaining bar charts for attrno. Plots that map y = ..prop.. with group = 1
# show each category's share of all rows; the others show raw counts.

# Share of rows per RelationshipSatisfaction score.
ggplot(attrno, aes(x = RelationshipSatisfaction, y = ..prop.., group = 1)) +
  geom_bar()

# Row counts per StandardHours value.
ggplot(attrno, aes(x = StandardHours)) +
  geom_bar()

# Row counts per StockOptionLevel.
ggplot(attrno, aes(x = StockOptionLevel)) +
  geom_bar()

# Row counts per TotalWorkingYears value.
ggplot(attrno, aes(x = TotalWorkingYears)) +
  geom_bar()

# Row counts per TrainingTimesLastYear value.
ggplot(attrno, aes(x = TrainingTimesLastYear)) +
  geom_bar()

# Share of rows per WorkLifeBalance score.
ggplot(attrno, aes(x = WorkLifeBalance, y = ..prop.., group = 1)) +
  geom_bar()

# Row counts per YearsAtCompany value.
ggplot(attrno, aes(x = YearsAtCompany)) +
  geom_bar()

# Row counts per YearsInCurrentRole value.
ggplot(attrno, aes(x = YearsInCurrentRole)) +
  geom_bar()

# Row counts per YearsSinceLastPromotion value.
ggplot(attrno, aes(x = YearsSinceLastPromotion)) +
  geom_bar()

# Row counts per YearsWithCurrManager value.
ggplot(attrno, aes(x = YearsWithCurrManager)) +
  geom_bar()

# Pairwise scatter plots on the full data set (attraw) to eyeball
# relationships between numeric variables. Each plot is x vs. y as named.

# Age vs. MonthlyIncome
ggplot(attraw, aes(x = Age, y = MonthlyIncome)) +
  geom_point()

# DailyRate vs. DistanceFromHome
ggplot(attraw, aes(x = DailyRate, y = DistanceFromHome)) +
  geom_point()

# YearsAtCompany vs. HourlyRate
ggplot(attraw, aes(x = YearsAtCompany, y = HourlyRate)) +
  geom_point()

# NumCompaniesWorked vs. MonthlyIncome
ggplot(attraw, aes(x = NumCompaniesWorked, y = MonthlyIncome)) +
  geom_point()

# TotalWorkingYears vs. PercentSalaryHike
ggplot(attraw, aes(x = TotalWorkingYears, y = PercentSalaryHike)) +
  geom_point()

# YearsAtCompany vs. MonthlyIncome
ggplot(attraw, aes(x = YearsAtCompany, y = MonthlyIncome)) +
  geom_point()

# YearsAtCompany vs. YearsSinceLastPromotion
ggplot(attraw, aes(x = YearsAtCompany, y = YearsSinceLastPromotion)) +
  geom_point()

# DailyRate vs. MonthlyRate
ggplot(attraw, aes(x = DailyRate, y = MonthlyRate)) +
  geom_point()

# Age vs. YearsSinceLastPromotion
ggplot(attraw, aes(x = Age, y = YearsSinceLastPromotion)) +
  geom_point()

# StockOptionLevel vs. YearsSinceLastPromotion
ggplot(attraw, aes(x = StockOptionLevel, y = YearsSinceLastPromotion)) +
  geom_point()

# Age vs. TotalWorkingYears
ggplot(attraw, aes(x = Age, y = TotalWorkingYears)) +
  geom_point()

# YearsInCurrentRole vs. MonthlyIncome
ggplot(attraw, aes(x = YearsInCurrentRole, y = MonthlyIncome)) +
  geom_point()

# YearsSinceLastPromotion vs. YearsWithCurrManager
ggplot(attraw, aes(x = YearsSinceLastPromotion, y = YearsWithCurrManager)) +
  geom_point()

Multiple regression

# just for kicks
# Recode Attrition as a numeric indicator so it can serve as the response in
# the lm() fits below. The recode is done on copies, leaving attryes, attrno
# and attraw themselves unchanged.
attrnewy <- attryes
attrnewn <- attrno
attrnewr <- attraw

# Use numeric 1/0 rather than the strings "1"/"0": a regression response
# should be numeric in the data itself instead of digit strings that the
# modelling code has to coerce.
#   attcoly: 1 where Attrition is "Yes", NA otherwise.
#   attcoln: 1 where Attrition is "No",  NA otherwise (note: here 1 marks the
#            "No" rows, the opposite of the coding used for attcolr).
#   attcolr: 1 where Attrition is "Yes", 0 otherwise.
attcoly <- ifelse(attrnewy$Attrition == "Yes", 1, NA)
attcoln <- ifelse(attrnewn$Attrition == "No", 1, NA)
attcolr <- ifelse(attrnewr$Attrition == "Yes", 1, 0)

attrnewy$Attrition <- attcoly
attrnewn$Attrition <- attcoln
attrnewr$Attrition <- attcolr


# Linear model of the recoded Attrition indicator on the numeric predictors,
# fitted to attrnewy. In attrnewy the indicator is 1 for every "Yes" row and
# NA otherwise, so the response has no variation for the predictors to explain.
lm(
  formula = Attrition ~ Age + DailyRate + DistanceFromHome + Education +
    EnvironmentSatisfaction + HourlyRate + JobInvolvement + JobLevel +
    JobSatisfaction + MonthlyIncome + MonthlyRate + NumCompaniesWorked +
    PercentSalaryHike + PerformanceRating + RelationshipSatisfaction +
    StockOptionLevel + TotalWorkingYears + TrainingTimesLastYear +
    WorkLifeBalance + YearsAtCompany + YearsInCurrentRole +
    YearsSinceLastPromotion + YearsWithCurrManager,
  data = attrnewy
)
## 
## Call:
## lm(formula = Attrition ~ Age + DailyRate + DistanceFromHome + 
##     Education + EnvironmentSatisfaction + HourlyRate + JobInvolvement + 
##     JobLevel + JobSatisfaction + MonthlyIncome + MonthlyRate + 
##     NumCompaniesWorked + PercentSalaryHike + PerformanceRating + 
##     RelationshipSatisfaction + StockOptionLevel + TotalWorkingYears + 
##     TrainingTimesLastYear + WorkLifeBalance + YearsAtCompany + 
##     YearsInCurrentRole + YearsSinceLastPromotion + YearsWithCurrManager, 
##     data = attrnewy)
## 
## Coefficients:
##              (Intercept)                       Age  
##                1.000e+00                 2.858e-17  
##                DailyRate          DistanceFromHome  
##                1.870e-19                -2.012e-17  
##                Education   EnvironmentSatisfaction  
##               -2.364e-16                -2.360e-17  
##               HourlyRate            JobInvolvement  
##                1.556e-17                 2.657e-16  
##                 JobLevel           JobSatisfaction  
##                2.083e-16                 3.037e-16  
##            MonthlyIncome               MonthlyRate  
##               -4.951e-21                 2.335e-20  
##       NumCompaniesWorked         PercentSalaryHike  
##                2.489e-16                -1.179e-16  
##        PerformanceRating  RelationshipSatisfaction  
##                8.658e-16                -4.273e-16  
##         StockOptionLevel         TotalWorkingYears  
##               -1.996e-16                -1.253e-16  
##    TrainingTimesLastYear           WorkLifeBalance  
##               -4.047e-16                -5.555e-16  
##           YearsAtCompany        YearsInCurrentRole  
##                8.261e-17                 3.067e-17  
##  YearsSinceLastPromotion      YearsWithCurrManager  
##               -1.783e-16                 1.370e-16
# Same linear model, fitted to attrnewn. In attrnewn the indicator is 1 for
# every "No" row and NA otherwise, so the response again has no variation for
# the predictors to explain.
lm(
  formula = Attrition ~ Age + DailyRate + DistanceFromHome + Education +
    EnvironmentSatisfaction + HourlyRate + JobInvolvement + JobLevel +
    JobSatisfaction + MonthlyIncome + MonthlyRate + NumCompaniesWorked +
    PercentSalaryHike + PerformanceRating + RelationshipSatisfaction +
    StockOptionLevel + TotalWorkingYears + TrainingTimesLastYear +
    WorkLifeBalance + YearsAtCompany + YearsInCurrentRole +
    YearsSinceLastPromotion + YearsWithCurrManager,
  data = attrnewn
)
## 
## Call:
## lm(formula = Attrition ~ Age + DailyRate + DistanceFromHome + 
##     Education + EnvironmentSatisfaction + HourlyRate + JobInvolvement + 
##     JobLevel + JobSatisfaction + MonthlyIncome + MonthlyRate + 
##     NumCompaniesWorked + PercentSalaryHike + PerformanceRating + 
##     RelationshipSatisfaction + StockOptionLevel + TotalWorkingYears + 
##     TrainingTimesLastYear + WorkLifeBalance + YearsAtCompany + 
##     YearsInCurrentRole + YearsSinceLastPromotion + YearsWithCurrManager, 
##     data = attrnewn)
## 
## Coefficients:
##              (Intercept)                       Age  
##                1.000e+00                 1.320e-16  
##                DailyRate          DistanceFromHome  
##               -1.064e-18                -7.623e-18  
##                Education   EnvironmentSatisfaction  
##               -6.703e-16                 4.264e-17  
##               HourlyRate            JobInvolvement  
##               -2.611e-18                -5.119e-16  
##                 JobLevel           JobSatisfaction  
##                5.044e-16                -2.218e-16  
##            MonthlyIncome               MonthlyRate  
##               -1.121e-19                 6.625e-20  
##       NumCompaniesWorked         PercentSalaryHike  
##               -7.704e-17                 8.983e-17  
##        PerformanceRating  RelationshipSatisfaction  
##                1.483e-15                 4.010e-16  
##         StockOptionLevel         TotalWorkingYears  
##                1.077e-16                -1.272e-16  
##    TrainingTimesLastYear           WorkLifeBalance  
##                1.435e-18                 1.197e-16  
##           YearsAtCompany        YearsInCurrentRole  
##                2.890e-17                 8.767e-17  
##  YearsSinceLastPromotion      YearsWithCurrManager  
##               -1.480e-16                 8.878e-17
# Same linear model, fitted to the full recoded data set attrnewr, where the
# indicator is 1 for Attrition == "Yes" and 0 otherwise.
# NOTE(review): with a 0/1 response a logistic model (glm with
# family = binomial) is the usual choice -- consider it for the real analysis.
lm(
  formula = Attrition ~ Age + DailyRate + DistanceFromHome + Education +
    EnvironmentSatisfaction + HourlyRate + JobInvolvement + JobLevel +
    JobSatisfaction + MonthlyIncome + MonthlyRate + NumCompaniesWorked +
    PercentSalaryHike + PerformanceRating + RelationshipSatisfaction +
    StockOptionLevel + TotalWorkingYears + TrainingTimesLastYear +
    WorkLifeBalance + YearsAtCompany + YearsInCurrentRole +
    YearsSinceLastPromotion + YearsWithCurrManager,
  data = attrnewr
)
## 
## Call:
## lm(formula = Attrition ~ Age + DailyRate + DistanceFromHome + 
##     Education + EnvironmentSatisfaction + HourlyRate + JobInvolvement + 
##     JobLevel + JobSatisfaction + MonthlyIncome + MonthlyRate + 
##     NumCompaniesWorked + PercentSalaryHike + PerformanceRating + 
##     RelationshipSatisfaction + StockOptionLevel + TotalWorkingYears + 
##     TrainingTimesLastYear + WorkLifeBalance + YearsAtCompany + 
##     YearsInCurrentRole + YearsSinceLastPromotion + YearsWithCurrManager, 
##     data = attrnewr)
## 
## Coefficients:
##              (Intercept)                       Age  
##                9.322e-01                -3.857e-03  
##                DailyRate          DistanceFromHome  
##               -3.351e-05                 3.859e-03  
##                Education   EnvironmentSatisfaction  
##               -9.891e-04                -3.510e-02  
##               HourlyRate            JobInvolvement  
##               -2.341e-04                -6.199e-02  
##                 JobLevel           JobSatisfaction  
##               -2.366e-02                -3.470e-02  
##            MonthlyIncome               MonthlyRate  
##               -1.226e-06                 6.207e-07  
##       NumCompaniesWorked         PercentSalaryHike  
##                1.430e-02                -4.303e-03  
##        PerformanceRating  RelationshipSatisfaction  
##                3.053e-02                -1.751e-02  
##         StockOptionLevel         TotalWorkingYears  
##               -5.513e-02                -3.103e-03  
##    TrainingTimesLastYear           WorkLifeBalance  
##               -1.661e-02                -2.893e-02  
##           YearsAtCompany        YearsInCurrentRole  
##                5.986e-03                -1.099e-02  
##  YearsSinceLastPromotion      YearsWithCurrManager  
##                1.167e-02                -1.193e-02
# Need to narrow down variables to check correlation